Business Problem

The situation of Airbnb houses demand in Boston and Beijing. Compared it with the demand of hotels on Booking.com to futher discover the influence of competitor to Airbnb.

Data Sources

Boston and Beijing Airbnb data: http://insideairbnb.com/get-the-data.html Boston and Beijing Booking data: web crawler

Inside Airbnb is an non-commercial and independent dataset of Airbnb global demand, which includes the first-hand data of Airbnb booking history and reviews of users.

Preprocessing Steps

Set up environment

library(tidyverse)
library(magrittr)
library(readr)
library(lubridate)
library(ggthemes)
library(leaflet)
bj_list <- read_csv('BJlistings.csv')
bj_reviews <- read_csv('BJreviews.csv')
bj_calendar <- read_csv('BJcalendar.csv')

bos_list <- read_csv('Boslistings.csv')
bos_reviews <- read_csv('Bosreviews.csv')
bos_calendar <- read_csv('Boscalendar.csv')

Primary Variables Removal

1. remove non neccessary and repeated columns

# step 1
# remove all url, which are NAs
bos_list_NAomit <- bos_list %>% 
  select(-ends_with("_url"), -experiences_offered,
         -host_total_listings_count,-calendar_last_scraped, -country)

bj_list_NAomit <- bj_list %>% 
  select(-ends_with("_url"), -experiences_offered,
         -host_total_listings_count,-calendar_last_scraped, -country)

# step 2
# remove needless columns
bj_list_rmneedlss <- bj_list_NAomit[ ,-c(2:3)]
bos_list_rmneedless <- bos_list_NAomit[,-c(2:3)]

# step 3
# remove columns with over 60% NA observations
bj_list_cleaned <- bj_list_rmneedlss[, -which(colMeans(is.na(bj_list_rmneedlss)) > 0.8)]
bos_list_cleaned <- bos_list_rmneedless[, -which(colMeans(is.na(bos_list_rmneedless)) > 0.6)]

# View the columns we removed for step3
bos_removed_col <- apply(bos_list_rmneedless, 2, function(col)sum(is.na(col))/length(col))
bos_removed_col[bos_removed_col > 0.6]
## neighbourhood_group_cleansed                  square_feet 
##                    1.0000000                    0.9646421 
##                 weekly_price                monthly_price 
##                    0.9167380                    0.9155974
bj_removed_col <- apply(bj_list_rmneedlss, 2, function(col)sum(is.na(col))/length(col))
bj_removed_col[bj_removed_col>0.8]
## neighbourhood_group_cleansed                  square_feet 
##                    1.0000000                    0.9995878 
##                 weekly_price                monthly_price 
##                    0.9915752                    0.9914206 
##                      license           jurisdiction_names 
##                    0.8973566                    0.9999742
# step 3 removed for beijing & boston listing: neighbourhood_group_cleansed, square_feet, weekly_price, monthly_price

# special removal only for beijingllisting: license, jurisdiction_names 

# though high NA rate but will not be directly removed for beijing listing:
# cleaning_fee,security_deposit (over 60% NA rate)

2. remove $ sign for security_deposit, price, and cleaning_fee

# eliminate $ sign 
bj_list_cleaned$security_deposit = gsub("\\$", "", bj_list_cleaned$security_deposit) %>% as.numeric()
bj_list_cleaned$price <- gsub("\\$", "", bj_list_cleaned$price) %>% as.numeric()
bj_list_cleaned$cleaning_fee <- gsub("\\$", "", bj_list_cleaned$cleaning_fee) %>% as.numeric()

bos_list_cleaned$security_deposit <- gsub("\\$", "", bos_list_cleaned$security_deposit) %>% as.numeric()
bos_list_cleaned$price <- gsub("\\$", "", bos_list_cleaned$price) %>% as.numeric()
bos_list_cleaned$cleaning_fee <- gsub("\\$", "", bos_list_cleaned$cleaning_fee) %>% as.numeric()

# $ sign in calendar data
bos_calendar$price <- gsub("\\$", "", bos_calendar$price) %>% as.numeric()
bj_calendar$price <- gsub("\\$", "", bj_calendar$price) %>% as.numeric()

summary(bos_calendar$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    10.0    80.0   144.0   192.6   232.0   999.0   24144
summary(bj_calendar$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0   246.0   371.0   406.1   528.0   999.0 1756475

3. remove % sign for host_response_rate

bos_list_cleaned$host_response_rate <- sapply(bos_list_cleaned$host_response_rate,
                                              function(x) gsub('%','',x))
bj_list_cleaned$host_response_rate <- sapply(bj_list_cleaned$host_response_rate, 
                                         function(x) gsub('%','',x))

4. Create current time format for reviews and calendar data

# timestamp clean into lubricate version for reviews of boston and beijing
bos_reviews$date <- ymd(bos_reviews$date)
bos_reviews$year <- year(bos_reviews$date)
bos_reviews$wkd <- wday(bos_reviews$date,label = TRUE)
bj_reviews$date <- ymd(bj_reviews$date)
bj_reviews$year <- year(bj_reviews$date)
bj_reviews$wkd <- wday(bj_reviews$date, label=TRUE)

bos_calendar$date <- ymd(bos_calendar$date)
bos_calendar$year <- year(bos_calendar$date)
bos_calendar$month <- month(ymd(bos_calendar$date))
bos_calendar$wkd <- wday(bos_calendar$date,label = TRUE)
bj_calendar$date <- ymd(bj_calendar$date)
bj_calendar$year <- year(bj_calendar$date)
bj_calendar$wkd <- wday(bj_calendar$date, label=TRUE)
bj_calendar$month <- month(ymd(bj_calendar$date))

EDA Part 1

Graph 1

bj_reviews_yr <- bj_reviews %>%
  group_by(year) %>%  
  count(listing_id) %>% 
  arrange(desc(n)) 
bos_reviews_yr <- bos_reviews %>% 
  group_by(year) %>% 
  count(listing_id) %>% 
  arrange(desc(n)) 


# bjl %>% select(id, name, number_of_reviews) %>% arrange(desc(number_of_reviews))
# bjl %>% select(id, name, number_of_reviews) %>% arrange(number_of_reviews)
# 23437 listings have reviews for Beijing
# 23437/38814
#bosl %>% select(id, name, number_of_reviews) %>% #arrange(desc(number_of_reviews))
# 3507 listings have reviews for Boston
# 3507/3585
# the rate of review in boston is higher than beijing 

ggplot(bj_reviews_yr,aes(x=as.factor(year),y=n))+
  geom_jitter(alpha=0.3,aes(color=n)) +
  geom_smooth() +
  theme_economist()+
  scale_fill_brewer() +ylim(0,250) +labs(title = 'How popular is Airbnb in Beijing', subtitle = 'Number of reviews received for a single listing over years')
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).

ggplot(bos_reviews_yr,aes(x=as.factor(year),y=n))+
  geom_jitter(alpha=0.3,aes(color=n)) +
  geom_smooth() +
  theme_economist()+
  scale_fill_brewer() +ylim(0,250) +
  labs(title = 'How popular is Airbnb in Boston', subtitle = 'Number of reviews received for a single listing over years')
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Graph 2

bos_list_cleaned_rmna <- bos_list_cleaned %>% 
  filter(!is.na(review_scores_rating)) %>% 
  filter(!is.na(host_is_superhost))

bj_list_cleaned_rmna <- bj_list_cleaned %>% 
  filter(!is.na(review_scores_rating)) %>% 
  filter(!is.na(host_is_superhost))

bos_list_cleaned_rmna$host_response_rate <- as.numeric(bos_list_cleaned_rmna$host_response_rate)
## Warning: NAs introduced by coercion
bj_list_cleaned_rmna$host_response_rate <- as.numeric(bj_list_cleaned_rmna$host_response_rate)
## Warning: NAs introduced by coercion
summary(bj_list_cleaned_rmna$host_response_rate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00  100.00  100.00   96.54  100.00  100.00    1894
summary(bos_list_cleaned_rmna$host_response_rate)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00   99.00  100.00   97.09  100.00  100.00     286
ggplot(bos_list_cleaned_rmna,aes(x=host_response_rate,y=review_scores_rating)) +
  geom_jitter(aes(color=as.factor(host_is_superhost)),alpha=0.3) +theme_economist() +labs(title = 'Indicators for SuperHost', subtitle = 'Avg. Rating by Response Rate in Boston')
## Warning: Removed 286 rows containing missing values (geom_point).

ggplot(bj_list_cleaned_rmna,aes(x=host_response_rate,y=review_scores_rating)) +
  geom_jitter(aes(color=as.factor(host_is_superhost)),alpha=0.3) +theme_economist() +
  labs(title = 'Indicators for SuperHost', subtitle = 'Avg. Rating by Response Rate in Beijing')
## Warning: Removed 1894 rows containing missing values (geom_point).

Graph 3

glimpse(bos_calendar)
## Observations: 1,280,055
## Variables: 10
## $ listing_id     <dbl> 5506, 5506, 5506, 5506, 5506, 5506, 5506, 5506, 5506...
## $ date           <date> 2019-12-04, 2019-12-05, 2019-12-06, 2019-12-07, 201...
## $ available      <lgl> FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRU...
## $ price          <dbl> 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, ...
## $ adjusted_price <chr> "$79.00", "$79.00", "$79.00", "$79.00", "$79.00", "$...
## $ minimum_nights <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3...
## $ maximum_nights <dbl> 730, 730, 730, 730, 730, 730, 730, 730, 730, 730, 73...
## $ year           <dbl> 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019...
## $ month          <dbl> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, ...
## $ wkd            <ord> Wed, Thu, Fri, Sat, Sun, Mon, Tue, Wed, Thu, Fri, Sa...
dim(bos_calendar)
## [1] 1280055      10
summary(bos_calendar$year)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2019    2020    2020    2020    2020    2020
avg_price_bj <- bj_calendar %>% 
  group_by(date, wkd) %>% 
  summarize(avgprice= mean(price, na.rm=T))

avg_price_bos <- bos_calendar %>% 
  group_by(date, wkd) %>% 
  summarize(avgprice=mean(price,na.rm=T))

ggplot(avg_price_bos, aes(x=wkd, y=avgprice)) + 
  geom_boxplot(na.rm=T) + geom_jitter(alpha=0.2) + 
  theme_economist() + 
  labs(title = 'Price Trends over the weekday & weekends in Boston', subtitle = 'avgprice = Avg. price by day')

ggplot(avg_price_bj,aes(x=wkd,y=avgprice)) + 
  geom_boxplot(na.rm=T) + geom_jitter(alpha=0.2) +
  theme_economist() + 
  labs(title = 'Price Trends over the weekday & weekends in Beijing', subtitle = 'avgprice = Avg. price by day')

Graph 4

avg_price_bj_2020 <- bj_calendar %>% 
  filter(year == 2020) %>% 
  group_by(listing_id, month) %>% 
  summarize(avgprice= mean(price,na.rm=T))

avg_price_bos_2020 <- bos_calendar %>% 
  filter(year == 2020) %>% 
  group_by(listing_id, month) %>% 
  summarize(avgprice=mean(price,na.rm=T))

ylim_bos<-boxplot.stats(avg_price_bos_2020$avgprice)$stats[c(1, 5)]
ggplot(avg_price_bos_2020, aes(x = factor(month), y=avgprice)) + 
  geom_boxplot(outlier.shape = NA) + 
  coord_cartesian(ylim = ylim_bos * 1.5) +
  theme_economist() + 
  labs(title = 'Price Trends over the month in Boston', subtitle = 'avgprice = Avg. price by listing')
## Warning: Removed 681 rows containing non-finite values (stat_boxplot).

ylim_bj<-boxplot.stats(avg_price_bj_2020$avgprice)$stats[c(1, 5)]
ggplot(avg_price_bj_2020, aes(x = factor(month), y=avgprice)) + 
  geom_boxplot(outlier.shape = NA) + 
  coord_cartesian(ylim = ylim_bos * 2.4) +
  theme_economist() + 
  labs(title = 'Price Trends over the month in Beijing', subtitle = 'avgprice = Avg. price by listing')
## Warning: Removed 51194 rows containing non-finite values (stat_boxplot).

Graph 5

bosAirbnb <- bos_list_cleaned %>% 
  mutate(Log1pPrice = log1p(price), transformed_review = bos_list_cleaned$review_scores_rating^5)

bosAirbnb <- bosAirbnb %>% select(-price, -review_scores_rating)

pal <- colorNumeric(palette = rainbow(6), domain = bosAirbnb$Log1pPrice)

leaflet(data = bosAirbnb[is.na(bosAirbnb$Log1pPrice)==FALSE,]) %>%  
  addProviderTiles(providers$CartoDB.Positron) %>% 
  addCircleMarkers(~longitude, ~latitude, 
                   color = ~pal(Log1pPrice), weight = 1, radius=1.5, 
                   fillOpacity = 1, opacity = 1,
                   label = paste("Neighbourhood:", bosAirbnb$neighbourhood_cleansed)) %>% 
  addLegend("bottomright", pal = pal, values = ~Log1pPrice,
            title = "Log1pPrice",
            opacity = 1)
BJAirbnb <- bj_list_cleaned %>% 
  mutate(Log1pPrice = log1p(price), transformed_review = bj_list_cleaned$review_scores_rating^5)
BJAirbnb <- BJAirbnb %>% select(-price, -review_scores_rating)

pal <- colorNumeric(palette = rainbow(6), domain = BJAirbnb$Log1pPrice)

leaflet(data = BJAirbnb[is.na(BJAirbnb$Log1pPrice)==FALSE,]) %>%  
  addProviderTiles(providers$CartoDB.Positron) %>% 
  addCircleMarkers(~longitude, ~latitude, 
                   color = ~pal(Log1pPrice), weight = 1, radius=1.5, 
                   fillOpacity = 1, opacity = 1,
                   label = paste("Neighbourhood:", BJAirbnb$neighbourhood_cleansed)) %>% 
  addLegend("bottomright", pal = pal, values = ~Log1pPrice,
            title = "Log1pPrice",
            opacity = 1)